In [1]:
#Import required packages
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
In [2]:
def format_date(df_date):
"""
Splits Meeting Times and Dates into datetime objects where applicable using regex.
"""
df_date['Days'] = df_date['Meeting_Times'].str.extract('([^\s]+)', expand=True)
df_date['Start_Date'] = df_date['Meeting_Dates'].str.extract('([^\s]+)', expand=True)
df_date['Year'] = df_date['Term'].astype(str).str.slice(0,4)
df_date['Quarter'] = df_date['Term'].astype(str).str.slice(4,6)
df_date['Term_Date'] = pd.to_datetime(df_date['Year'] + df_date['Quarter'], format='%Y%m')
#df_date['Start_Month'] = pd.to_datetime(df_date['Year'] + df_date['Start_Date'], format='%Y%b')
df_date['End_Date'] = df_date['Meeting_Dates'].str.extract('(?<=-)(.*)(?= )', expand=True)
#df_date['End_Month'] = pd.to_datetime(df_date['End_Date'], format='%b')
df_date['Start_Time'] = df_date['Meeting_Times'].str.extract('(?<= )(.*)(?=-)', expand=True)
df_date['Start_Time'] = pd.to_datetime(df_date['Start_Time'], format='%H%M')
df_date['End_Time'] = df_date['Meeting_Times'].str.extract('((?<=-).*$)', expand=True)
df_date['End_Time'] = pd.to_datetime(df_date['End_Time'], format='%H%M')
df_date['Duration_Hr'] = ((df_date['End_Time'] - df_date['Start_Time']).dt.seconds)/3600
#df_date = df_date.set_index(pd.DatetimeIndex(df_date['Term_Date']))
return df_date
def format_xlist(df_xl):
"""
revises % capacity calculations by using Max Enrollment instead of room capacity.
"""
df_xl['Cap_Diff'] = np.where(df_xl['Xlst'] != '',
df_xl['Max_Enrl'].astype(int) - df_xl['Actual_Enrl'].astype(int),
df_xl['Room_Capacity'].astype(int) - df_xl['Actual_Enrl'].astype(int))
df_xl = df_xl.loc[df_xl['Room_Capacity'].astype(int) < 999]
return df_xl
In [3]:
"""
Main program control flow.
"""
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
df = pd.read_csv('data/PSU_master_classroom.csv', dtype={'Schedule': object, 'Schedule Desc': object})
df = df.fillna('')
df = format_date(df)
# Avoid classes that only occur on a single day
df = df.loc[df['Start_Date'] != df['End_Date']]
df = df.loc[df['Online Instruct Method'] != 'Fully Online']
# Calculate number of days per week and treat Sunday condition
df['Days_Per_Week'] = df['Days'].str.len()
df['Room_Capacity'] = df['Room_Capacity'].apply(lambda x: x if (x != 'No Data Available') else 0)
df_cl = format_xlist(df)
In [4]:
# Map and Enumerate
from sklearn.preprocessing import LabelEncoder
cat_columns = ['Dept', 'Class', 'Meeting_Times', 'ROOM' ]
for column in cat_columns:
col_mapping = {label: idx for idx, label in enumerate(np.unique(df_cl['{0}'.format(column)]))}
df_cl['{0}'.format(column)] = df_cl['{0}'.format(column)].map(col_mapping)
In [5]:
X = df_cl[['Dept', 'Term', 'Class', 'Meeting_Times', 'ROOM']].values
df_cl_le = LabelEncoder()
X[:, 0] = df_cl_le.fit_transform(X[:, 0])
X
Out[5]:
In [6]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()
pd.get_dummies(df[['Dept', 'Term', 'Class', 'Meeting_Times', 'ROOM']])
Out[6]: